import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
  
# Load the dataset
credit_customers = pd.read_csv("credit_customers.csv")

# Extract the columns used as clustering features
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers[important_columns].copy()

# Apply Label Encoding to 'savings_status' and 'employment'.
# Each column gets its own encoder so both fitted mappings stay available.
# NOTE(review): the integer codes are fed to KMeans as numeric values, so the
# code order influences distances -- confirm that this is intended.
le_savings_status = LabelEncoder()
le_employment = LabelEncoder()

data_for_clustering['savings_status'] = le_savings_status.fit_transform(data_for_clustering['savings_status'])
data_for_clustering['employment'] = le_employment.fit_transform(data_for_clustering['employment'])

# Apply One-Hot Encoding to 'credit_history' (first level dropped)
data_for_clustering = pd.get_dummies(data_for_clustering, columns=['credit_history'], drop_first=True)

# Normalize the data using Standard Scaling
scaler = StandardScaler()
data_for_clustering_scaled = scaler.fit_transform(data_for_clustering)

# Candidate cluster counts, shared by the scoring loop and the plot's x-axis so
# the two cannot drift apart. Starts at 2 because the silhouette score is not
# defined for a single cluster.
cluster_range = range(2, 15)

# Calculate silhouette scores for different number of clusters
silhouette_scores = []

for cluster_num in cluster_range:
    # NOTE(review): n_init is left at the library default, which differs
    # between scikit-learn releases -- pin it if scores must be reproducible
    # across versions.
    kmeans = KMeans(n_clusters=cluster_num, random_state=42)
    cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)
    silhouette_avg = silhouette_score(data_for_clustering_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o', linestyle='--')
plt.title('Silhouette Scores for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)

# Create the output directory first: savefig does not create missing parent
# directories and would otherwise fail after all the clustering work is done.
output_path = 'ref_result/Silhouette_Scores.png'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path)
plt.show()